# libraries
library(dplyr)
library(ggplot2)
library(forcats)
# source functions
names_functions = list.files(here::here("functions"))
for (f in names_functions)
source(here::here("functions", f))
rm(f, names_functions)
This notebook constructs the priors for the model estimation. See model_description.html for details. Some of the priors are independent of the specific scenario being estimated, e.g. the innovations to the reverse random-walk \boldsymbol{W}; others depend on the scenario and the resulting fundamental forecast, e.g. the prior mean m_{\mu_T} and variance V_{\mu_T} on the (transformed) latent voting intentions on election day.
Initialize empty list to store priors for different scenarios
priors <- list()
Load states
states <- load_states()
Load number of parties running in each state -> needed to set the prior on \mu_T
n_parties_by_state <- load_n_parties_by_geography("state")
Load election results -> needed to estimate \boldsymbol{W}
election_results <- load_election_vote_shares()
n_elections <- length(unique(election_results$year))
Load electoral votes -> used in simulating the prior probabilities of winning the election
electoral_votes <- load_electoral_votes()
Calculate the dimnames of \mu_T, i.e. the state and party that each entry refers to. These are also the dimnames of \boldsymbol{W} and V_{\mu_T}. They serve as a cross-check that the priors are ordered in the same way as in the data preparation and estimation!
names_mmu_T <- c()
parties <- load_parties()
for (s in seq(1, length(n_parties_by_state))) {
tmp <- paste0(names(n_parties_by_state)[s], "_")
for (p in seq(1, n_parties_by_state[s] - 1)) {
names_mmu_T <- c(names_mmu_T, paste0(tmp, parties[p]))
}
}
Load the list of states in which historically one party has been the clear winner
dominated_states <- readRDS(here::here("fundamental_forecast", "dominated_states.rds"))
To specify the “prior” on the innovations to the reverse random-walk, it is useful to decompose the covariance as
\boldsymbol{W} = \kappa \times \boldsymbol{\hat{W}}
where \boldsymbol{\hat{W}} is a correlation matrix and \kappa a scale factor. Intuitively, \boldsymbol{\hat{W}} accounts for the comovement of the changes in latent voting intentions across parties and states while \kappa governs how large changes in voting intentions are!
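As a toy illustration of these two roles (the numbers here are made up, not the values used below), one innovation of the reverse random-walk is a draw from \mathcal{N}(\boldsymbol{0}, \kappa \boldsymbol{\hat{W}}):
# toy example: kappa scales the size of the innovations, W_hat their
# correlation across the two series
W_hat_toy <- matrix(c(1, -0.5, -0.5, 1), nrow = 2)
kappa_toy <- 0.1
MASS::mvrnorm(1, mu = c(0, 0), Sigma = kappa_toy * W_hat_toy)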
To estimate \boldsymbol{\hat{W}} I consider the correlation of historical election outcomes across parties and states. These correlations are far from a perfect measure of what I am after because they contain no variation in vote shares/intentions within an election campaign. But using the historic results has the advantage of being straightforward to calculate!
An additional source of information could be the correlation across parties and states of polls from previous elections, but it’s less obvious to me how these would need to be handled. Aggregated over time in some way? Does it make sense to compare a poll in Amperville on May 5th in 1984 with one in Circuiton on June 7th in 2023? What does that say about the evolution of the underlying voting intentions from day to day? Also, polls don’t just track the voting intentions but also noise like house effects, although these might be constant from day to day, in which case actual movement in polls would be down to changes in voting intentions. On the whole, I am not sure how useful these data are, or at least how I could extract useful information from them for my goal.
Lastly, demographic information like ethnicity could be useful in specifying the correlation of changes in voting intentions between states. But similarities or differences in ethnicities across states don’t necessarily reveal anything about correlations between parties.
When calculating the correlation of latent vote share across states and parties, I use all the available historical election results since
[…] although parties’ vote shares in each province oscillate from year to year—possibly for quantifiable reasons like the strength of the economy, and possibly for unquantifiable ones like the comparative appeal of their candidates or of their proposed policies—the baseline popularity of each party in each province is constant all the way from 1984 to 2024, as are the characteristics of each pollster. There is no equivalent in Dataland of, say, West Virginia in the United States shifting over time from a reliably Democratic-voting state to a reliably Republican-voting one, because there are no long-run time trends. As a result, when predicting elections in 2024, you should treat historical results and polls from 1984 as being just as informative as those from 2023 are. (background material on Dataland, my emphasis)
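The correlation across elections is computed with the sourced helper calc_cor_election_results a few chunks below. As a point of reference, a sketch of what I take it to do — arrange the results into an elections-by-(state, party) matrix so that cor() gives the correlation across elections; the actual helper lives in functions/ and may differ, e.g. in how it treats parties that do not run in a state:
# hypothetical sketch, not the sourced helper: one row per election
# year, one column per state-party vote share (the actual helper
# presumably drops state-party pairs where the party does not run)
calc_cor_election_results_sketch <- function(election_results) {
  election_results %>%
    tidyr::pivot_longer(
      cols = c("CC", "DGM", "PDAL", "SSP"),
      names_to = "party",
      values_to = "vote_share"
    ) %>%
    tidyr::unite(name, state, party, sep = "_") %>%
    tidyr::pivot_wider(names_from = name, values_from = vote_share) %>%
    select(-year) %>%
    as.matrix()
}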
Plot historical election results
election_results %>%
tidyr::pivot_longer(
cols = c("CC", "DGM", "PDAL", "SSP"),
names_to = "party",
values_to = "vote_share"
) %>%
filter(vote_share != 0) %>%
ggplot(aes(
x = year,
y = vote_share,
color = party
)
) +
geom_point() +
facet_wrap(~state) +
ggsci::scale_color_jco()
# calculate correlation across elections
mat_res <- calc_cor_election_results(election_results)
corr_mat <- cor(mat_res)
Given the large number of parameters I estimate and the relatively short sample, I also consider a James-Stein-type shrinkage estimator of the correlation matrix. The degree of shrinkage is estimated from the data (see the documentation of cor.shrink). Shrinking the correlation coefficients may improve the performance of the model by dampening the estimation uncertainty.
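If I read the corpcor documentation correctly, the shrunk estimate is a convex combination of the sample correlation matrix and the identity; a sketch with a made-up intensity:
# sketch of the shrinkage (my reading of corpcor::cor.shrink): with
# intensity lambda, off-diagonal correlations are scaled by (1 - lambda)
lambda <- 0.2  # made-up value; cor.shrink estimates this from the data
corr_mat_manual <- lambda * diag(nrow(corr_mat)) + (1 - lambda) * corr_mat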
corr_mat_shrink <- corpcor::cor.shrink(mat_res)
Estimating optimal shrinkage intensity lambda (correlation matrix): 0.2019
# manually convert to matrix class first
class(corr_mat_shrink) <- "matrix"
Use shrunk correlation matrix!
W_hat <- corr_mat_shrink
kappa <- 0.1
W <- kappa * W_hat
priors[["A"]][["W"]] <-
priors[["B"]][["W"]] <-
priors[["C"]][["W"]] <-
priors[["D"]][["W"]] <-
priors[["E"]][["W"]] <- WNote that there is a discrepancy between how Stoetzer et al. (2019) set the value in the code - 0.001 - and the accompanying comment (“1 percent point sd”). In addition, this also differs from what the say about the prior in the original paper on page 258! There they mention that \delta_{c,p} \sim \mathcal{N}(0, 1) which refers to the log ratio transformed house effects!
sig_ddelta <- 0.005 # specify in terms of standard deviation for Stan!
Store in list
priors[["A"]][["sig_ddelta"]] <-
priors[["B"]][["sig_ddelta"]] <-
priors[["C"]][["sig_ddelta"]] <-
priors[["D"]][["sig_ddelta"]] <-
priors[["E"]][["sig_ddelta"]] <- sig_ddeltascenarios <- load_scenarios()Load fundamental forecast
df_fcast <- readRDS(here::here("fundamental_forecast", "fundamental_forecast.rds"))
Loop over scenarios, convert to the log ratio scale, and store in list
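The loop relies on the sourced helpers additive_log_ratio and inv_additive_log_ratio (used again further below). For reference, a minimal sketch of the form I take them to have, with the last component as the reference category:
# assumed form of the sourced helpers (sketch): the reference (last)
# component maps to 0 on the log ratio scale, and the inverse is the
# softmax over the full vector including the 0 reference entry
additive_log_ratio_sketch <- function(p) log(p / p[length(p)])
inv_additive_log_ratio_sketch <- function(x) exp(x) / sum(exp(x))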
for (scen in scenarios) {
df_fcast %>%
inner_join(
load_dataland_states_regions(),
by = join_by(province == state)) %>%
filter(scenario == scen) %>%
select(-scenario) %>%
mutate(party = toupper(party)) %>%
group_by(province) %>%
mutate(vote_share_lr = additive_log_ratio(vote_share)) %>%
filter(vote_share_lr != 0) %>%
arrange(region, province) %>%
tidyr::unite(
name,
province,
party,
sep = "_") -> df_m_mmu_T
# check calc
stopifnot(all(df_m_mmu_T$name == names_mmu_T))
# store in list
priors[[scen]][["m_mmu_T"]] <- df_m_mmu_T$vote_share_lr
names(priors[[scen]][["m_mmu_T"]]) <- names_mmu_T
rm(df_m_mmu_T)
}
The prior variance can vary across scenarios to reflect how far advanced the campaign is.
In addition, it can be set to a smaller value - placing greater weight on the fundamental forecast - in those states where, within a given scenario, fewer or no polls are available.
It could also be made much tighter to reflect the fact that in previous elections (not used in the estimation!) certain parties were clearly dominant in some states, and that in the absence of further evidence this dominance seems likely to hold.
Scale elements of V_{\mu_T} down if party is “dominating”
var_V_mmu_T <- 1
scale_dominated_states <- 0.2
diag_V_mmu_T <- c()
for (s in states) {
if (s %in% dominated_states) {
diag_V_mmu_T <- c(
diag_V_mmu_T,
rep(
scale_dominated_states * var_V_mmu_T,
n_parties_by_state[s] - 1)
)
} else {
diag_V_mmu_T <- c(
diag_V_mmu_T,
rep(var_V_mmu_T, n_parties_by_state[s] - 1)
)
}
}
V_mmu_T <- diag(diag_V_mmu_T)
dimnames(V_mmu_T) <- list(names_mmu_T, names_mmu_T)
Given differences in data availability across scenarios, the prior could be made even tighter. For the time being, however, I don’t let the tightness of the prior vary:
scale_scenarios <- c(
"A" = 1,
"B" = 1,
"C" = 1,
"D" = 1,
"E" = 1
)
Store in list
for (scen in scenarios) {
priors[[scen]][["V_mmu_T"]] <- V_mmu_T * scale_scenarios[scen]
}
saveRDS(priors, file = here::here("priors", "priors.Rds"))
rm(priors)
To evaluate whether the joint prior distribution specified above translates into reasonable prior beliefs for the expected voting intentions or observed polls, I draw independent samples from the marginal priors in the different scenarios and simulate the implied expected vote shares and house effects.
This is similar in spirit to the prior predictive distribution, which samples the data conditional on the prior. For definitions and background see one’s favorite textbook on Bayesian statistics, Wikipedia, or the Stan docs.
Three questions are of interest:
1. Does the joint prior imply reasonable ranges for the expected vote share, both across parties and across time?
2. Is the size of the house effects plausible?
3. What does the joint prior imply for the a priori win probabilities of the different elections? (See the sketch below.)
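Regarding the third question: the win probabilities are computed with the sourced helper calc_prob_win_election further below. As a sketch of the logic as I understand it — the helper itself and the column names of electoral_votes are assumptions here, and the national vote is ignored entirely:
# sketch only - my reading, not the actual sourced helper
prob_win_sketch <- function(df_draws_t, electoral_votes) {
  df_draws_t %>%
    # plurality winner per draw and state (ties broken arbitrarily)
    slice_max(values, n = 1, by = c(draw, geography), with_ties = FALSE) %>%
    inner_join(electoral_votes, by = c("geography" = "state")) %>%
    # electoral votes collected by each party in each draw
    summarise(ev = sum(electoral_votes), .by = c(draw, party)) %>%
    # the party with the most electoral votes wins the draw
    slice_max(ev, n = 1, by = draw, with_ties = FALSE) %>%
    count(party, name = "n_wins") %>%
    mutate(prob_win = n_wins / sum(n_wins))
}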
priors <- readRDS(file = here::here("priors", "priors.Rds"))
n_draws <- 250
n_periods <- length(
load_dates_election_campaign()
)
Function to obtain a draw from the joint prior
simulate_mmu_ppi <- function(
n_periods,
m_mmu_T,
V_mmu_T,
W,
sig_ddelta
) {
# draw mmu_T
mmu_T <- MASS::mvrnorm(
1,
mu = m_mmu_T,
Sigma = V_mmu_T
)
# draw mmu backwards from mmu_T via the reverse random walk
mmu <- matrix(
NA,
nrow = length(mmu_T),
ncol = n_periods
)
mmu[, n_periods] <- mmu_T
for (t in seq(n_periods - 1, 1, by = -1)) {
mmu[, t] = mmu[, t + 1] +
MASS::mvrnorm(1,
mu = rep(0, nrow(W)),
Sigma = W
)
}
# convert mmu to df
dimnames(mmu) <- list(names_mmu_T, seq(1, n_periods))
mmu %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "tmp") %>%
tidyr::pivot_longer(
cols = -tmp,
names_to = "t",
values_to = "mmu"
) %>%
tidyr::separate(tmp, into = c("state", "party"), sep = "_") -> df_mmu
# fill in "missing" parties with a value of 0
df_st_pa <- data.frame()
for (s in states) {
df_st_pa <- rbind(
df_st_pa,
data.frame(
state = s,
party = parties[1:n_parties_by_state[s]]
)
)
}
do.call("rbind",
lapply(
seq(1, n_periods),
function(t) {
df_tmp <- df_st_pa
df_tmp$t = t
df_tmp$ppi = NA
df_tmp
}
)) -> df_draw
df_draw <- merge(
df_draw,
df_mmu,
by = c("state", "party", "t"),
all.x = TRUE
)
# invert log ratio transformation
df_draw %>%
mutate(mmu = ifelse(
is.na(mmu),
0,
mmu
)
) %>%
group_by(t, state) %>%
mutate(
ppi = inv_additive_log_ratio(mmu),
mmu_plus_house = mmu + rnorm(n = n(), sd = sig_ddelta),
ttheta = inv_additive_log_ratio(mmu_plus_house)
) -> df_draw
return(df_draw)
}
Functions to plot \pi and \theta
plt_draws_ppi <- function(
df_draws,
plt_title,
plt_caption
) {
df_draws %>%
mutate(
values = ifelse(
values == 0.0,
NA,
values
)
) %>%
group_by(
t,
geography,
party
) %>%
summarise(
mn = mean(values, na.rm = TRUE),
q_95_low = quantile(
values,
probs = 0.025,
na.rm = TRUE
),
q_95_upp = quantile(
values,
probs = 0.975,
na.rm = TRUE
),
q_83_low = quantile(
values,
probs = 0.085,
na.rm = TRUE
),
q_83_upp = quantile(
values,
probs = 0.915,
na.rm = TRUE
)
) %>%
ungroup() %>%
ggplot(aes(x = t))+
geom_line(aes(y = mn, color = party), linetype = "dashed", linewidth = 1.0)+
geom_ribbon(aes(ymin = q_95_low, ymax = q_95_upp, fill = party), alpha = 0.2)+
geom_ribbon(aes(ymin = q_83_low, ymax = q_83_upp, fill = party), alpha = 0.3)+
ggsci::scale_color_jco() +
ggsci::scale_fill_jco() +
facet_wrap(~geography)+
labs(
x = "",
y = "",
title = plt_title,
caption = plt_caption
) -> p
return(p)
}
plt_house_effects <- function(
df_draws,
plt_title,
plt_caption
) {
df_draws %>%
ggplot(aes(x = house_effects, fill = party))+
geom_histogram(
bins = 100,
position = "identity",
alpha = 0.3)+
ggsci::scale_fill_jco() +
facet_wrap(~geography)+
labs(
x = "",
y = "",
title = plt_title,
caption = plt_caption
) -> p
return(p)
}
Function to simulate prior for a given scenario
simulate_prior <- function(
scen,
n_draws,
n_periods
) {
do.call(
"rbind",
lapply(
seq(1, n_draws),
function(draw) {
df_draw <- simulate_mmu_ppi(
n_periods = n_periods,
priors[[scen]]$m_mmu_T,
priors[[scen]]$V_mmu_T,
priors[[scen]]$W,
priors[[scen]]$sig_ddelta
)
df_draw$draw = draw
df_draw
}
)) -> df_draws
df_draws$scenario = scen
df_draws$house_effects <- df_draws$ttheta - df_draws$ppi
# aggregate over states to get national vote
load_pop_weights("national") %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "state") %>%
rename(pop_w = ".") -> df_state_weights
merge(
df_draws,
df_state_weights,
by = "state"
) %>%
select(draw, t, state, party, ppi, pop_w) %>%
mutate(ppi_w = ppi * pop_w) %>%
summarise(
ppi_nat = sum(ppi_w),
.by = c(draw, t, party)
) -> df_draws_nat
# calculate probability of winning election for each t
df_draws <- rename(
df_draws,
values = ppi,
geography = state
)
df_draws_nat <- rename(
df_draws_nat,
values = ppi_nat
)
df_prob_win_election <- do.call(
"rbind",
lapply(
seq(1, n_periods),
FUN = calc_prob_win_election,
df_draws_ppi = df_draws,
df_draws_ppi_nat = df_draws_nat,
states = states,
parties = parties,
electoral_votes = electoral_votes
)
)
# plot ppi
plt_ppi <- plt_draws_ppi(
df_draws,
plt_title = paste0(scen, ": Expected vote share"),
plt_caption = "Simulated prior distribution. Mean: dashed line; lighter (darker) ribbon: 95 (83) percent interval."
)
# plot house effects
plt_house <- plt_house_effects(
df_draws,
plt_title = paste0(scen, ": House effects"),
plt_caption = "Simulated prior distribution: difference between theta and pi (see model description).")
# plot probability of winning election
plot_prob_win_election_over_time(
df_prob_win_election,
plt_title_prefix = scen,
plt_caption = "Simulated from the prior distribution."
) -> plt_probwin
return(list(
df_draws = df_draws,
df_draws_nat = df_draws_nat,
df_prob_win_election = df_prob_win_election,
plt_ppi = plt_ppi,
plt_house = plt_house,
plt_probwin = plt_probwin
)
)
}
Loop over scenarios and simulate prior
plts <- list()
for (scen in scenarios) {
plts[[scen]] <- simulate_prior(
scen = "A",
n_periods = n_periods,
n_draws = n_draws
)
}
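The simulated-prior plots for a given scenario can then be inspected, e.g.:
# inspect the simulated prior for scenario "A"
plts[["A"]]$plt_ppi
plts[["A"]]$plt_house
plts[["A"]]$plt_probwin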